{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#%reset -f\n",
    "%matplotlib inline\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import feather\n",
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import Data Frame and create raw X and y arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time to read data via feather: 5.145771741867065\n"
     ]
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "if not os.path.exists(\"ipums.feather\"):\n",
    "    if not os.path.exists(\"ipums.csv\"):\n",
    "        !gunzip -c ipums.csv.gz > ipums.csv\n",
    "    !R -f ./ipums_feather.R \n",
    "df = feather.read_dataframe(\"ipums.feather\")\n",
    "t1 = time.time()\n",
    "print(\"Time to read data via feather: %r\" % (t1-t0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target = df.columns[-1] ## last column is the response\n",
    "cols = [c for c in df.columns if c != target]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(55776, 9732)\n",
      "(55776,)\n"
     ]
    }
   ],
   "source": [
    "X = np.array(df.ix[:,cols], dtype='float64', order='C')\n",
    "y = np.array(df[target].values, dtype='float64')\n",
    "print(X.shape)\n",
    "print(y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = None ## free mem\n",
    "import gc\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TASK:\n",
    "* Elastic Net Regression for Gaussian distribution\n",
    "* predict last column INCEARN from all other columns\n",
    "* alpha=0.5 (or ideally, 8 different values)\n",
    "* lambda search (100 lambdas)\n",
    "* Perform 5-fold cross-validation\n",
    "* Compute validation RMSE\n",
    "* Note: assume the dataset is dense, even though it isn't in this instance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vowpal Wabbit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if not os.path.exists(\"train.vw\"):\n",
    "    vw = np.concatenate([y.reshape(y.shape[0],1),X], axis=1)\n",
    "    np.savetxt(\"train.vw\", vw, delimiter=\" \", fmt=\"%g\")\n",
    "    !sed -i -e 's/ / |/' train.vw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "t0 = time.time()\n",
    "## performs OOB validation\n",
    "!./vw-8.20170116 -d train.vw 2>&1 | tee log.vw #--l1 1 --l2 1 --ftrl --passes 10 --cache_file cache.vw\n",
    "t1 = time.time()\n",
    "print(\"Time to run one model through Vowpal Wabbit: %r\" % (t1-t0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(\"RMSE\")\n",
    "## TODO - check whether 'average loss' is the right info\n",
    "!echo \"sqrt(`grep \"average loss\" log.vw | awk '{print $4}'`)\" | bc -l"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split data into train/valid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44620\n"
     ]
    }
   ],
   "source": [
    "H = (int)(0.8*X.shape[0])\n",
    "print(H)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trainX = X[:H,:]\n",
    "trainY = y[:H]\n",
    "validX = X[H:,:]\n",
    "validY = y[H:]\n",
    "X = None\n",
    "y = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.0, 0.14285714285714285, 0.2857142857142857, 0.42857142857142855, 0.5714285714285714, 0.7142857142857143, 0.8571428571428571, 1.0]\n"
     ]
    }
   ],
   "source": [
    "alphas = [r/7. for r in range(0,8)] ## final requirement for demo\n",
    "#alphas = [0.5] ## faster for testing\n",
    "print(alphas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scikit-Learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## TODO"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TensorFlow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.python.framework import ops\n",
    "\n",
    "ops.reset_default_graph()\n",
    "# Create graph\n",
    "sess = tf.Session()\n",
    "\n",
    "# Declare batch size\n",
    "batch_size = 32\n",
    "\n",
    "#Declare epochs\n",
    "epochs = 1\n",
    "\n",
    "# Initialize placeholders\n",
    "x_data = tf.placeholder(shape=[None, trainX.shape[1]], dtype=tf.float32)\n",
    "y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)\n",
    "\n",
    "# Create variables for linear regression\n",
    "A = tf.Variable(tf.random_normal(shape=[trainX.shape[1],1], seed = 42), name = \"A\")\n",
    "b = tf.Variable(np.mean(trainY, dtype = np.float32), name = \"b\")\n",
    "\n",
    "# Declare model operations\n",
    "model_output = tf.add(tf.matmul(x_data, A), b)\n",
    "\n",
    "# Declare the elastic net loss function\n",
    "elastic_param1 = tf.placeholder(tf.float32, shape=None, name=\"e1\")\n",
    "elastic_param2 = tf.placeholder(tf.float32, shape=None, name=\"e2\")\n",
    "lambda_ = tf.placeholder(tf.float32, shape=None, name=\"lambda\")\n",
    "l1_a_loss = tf.reduce_mean(tf.abs(A))\n",
    "l2_a_loss = tf.reduce_mean(tf.square(A))\n",
    "e1_term = tf.multiply(elastic_param1, l1_a_loss)\n",
    "e2_term = tf.multiply(elastic_param2, l2_a_loss)\n",
    "loss = tf.expand_dims(\n",
    "    tf.add(tf.reduce_mean(tf.square(y_target - model_output)),tf.multiply(lambda_,(tf.add(e1_term,e2_term)))), \n",
    "    0)\n",
    "\n",
    "# Declare optimizer\n",
    "my_opt = tf.train.GradientDescentOptimizer(0.01)\n",
    "train_step = my_opt.minimize(loss)\n",
    "\n",
    "np.random.seed(42)\n",
    "def batch(iterable, n=1):\n",
    "    l = len(iterable)\n",
    "    for ndx in range(0, l, n):\n",
    "        yield iterable[ndx:min(ndx + n, l)]\n",
    "        \n",
    "# Training loop\n",
    "init = tf.global_variables_initializer()\n",
    "sess.run(init)\n",
    "loss_vec = []\n",
    "lambdas = [5.282175]\n",
    "#alphas = [0.666667]\n",
    "t0 = time.time()\n",
    "for l in np.sort(lambdas)[::-1]:\n",
    "    for a in alphas:\n",
    "        # Initialize variables\n",
    "        #sess.run(init)\n",
    "        print(\"Lambda:\",l,\"Alpha:\",a)\n",
    "        for i in range(epochs):\n",
    "            indx = list(range(len(trainX)))\n",
    "            np.random.shuffle(indx)\n",
    "            \n",
    "            for rand_index in batch(indx, batch_size):\n",
    "                rand_x = trainX[rand_index,:]\n",
    "                rand_y = np.transpose([trainY[rand_index]])\n",
    "                feed_dict = {\n",
    "                  x_data: rand_x, y_target: rand_y,\n",
    "                  elastic_param1: a, elastic_param2: (1. - a) / 2.0, \n",
    "                  lambda_: l,\n",
    "                }\n",
    "                sess.run(train_step, feed_dict=feed_dict)\n",
    "print(\"Time for run through all alphas: \", time.time() - t0)\n",
    "from sklearn.metrics import mean_squared_error\n",
    "preds = np.dot(validX, sess.run(A)) + sess.run(b)\n",
    "print(\"RMSE:\",np.sqrt(mean_squared_error(validY, preds)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### GLMNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings\n",
    "import sys\n",
    "sys.path.insert(0, \"./glmnet_python/lib\")\n",
    "from glmnet import glmnet\n",
    "from glmnetPlot import glmnetPlot \n",
    "from glmnetPrint import glmnetPrint\n",
    "from glmnetCoef import glmnetCoef\n",
    "from glmnetPredict import glmnetPredict\n",
    "from cvglmnet import cvglmnet\n",
    "from cvglmnetCoef import cvglmnetCoef\n",
    "from cvglmnetPlot import cvglmnetPlot\n",
    "from cvglmnetPredict import cvglmnetPredict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainXscipy = scipy.array(trainX, dtype='float64')\n",
    "trainYscipy = scipy.array(trainY, dtype='float64')\n",
    "validXscipy = scipy.array(validX, dtype='float64')\n",
    "validYscipy = scipy.array(validY, dtype='float64')\n",
    "trainX = None ## free mem\n",
    "trainY = None\n",
    "validX = None\n",
    "validY = None\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def run_alpha(a):\n",
    "    t0 = time.time()\n",
    "    fit = cvglmnet(nfolds=5, x=trainXscipy, y=trainYscipy, family=\"gaussian\", alpha=a, nlambda=100)\n",
    "    t1 = time.time()\n",
    "    results = pd.DataFrame(columns=('alpha','lambda','rmse'))\n",
    "    print(\"Time to train glmnet: %r\" % (t1-t0))\n",
    "    for l in fit['lambdau']:\n",
    "        glmpred = glmnetPredict(fit, validXscipy, ptype = 'response', s = scipy.float64([l])).reshape(-1)\n",
    "        rmse=np.sqrt(np.mean(np.square(glmpred[0] - validYscipy)))\n",
    "        print(str(l) + \" \" + str(rmse))\n",
    "        results = results.append([{'alpha': a, 'lambda': l, 'rmse':rmse}])\n",
    "    results.to_csv(\"results.glmnet.\" + str(a))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-13-5947d3122890>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmultiprocessing\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mpool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPool\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_alpha\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malphas\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/home/arno/.pyenv/versions/3.6.0/lib/python3.6/multiprocessing/pool.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, func, iterable, chunksize)\u001b[0m\n\u001b[1;32m    258\u001b[0m         \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mthat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mreturned\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    259\u001b[0m         '''\n\u001b[0;32m--> 260\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapstar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    261\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    262\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mstarmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/arno/.pyenv/versions/3.6.0/lib/python3.6/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    601\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 602\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    603\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    604\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/arno/.pyenv/versions/3.6.0/lib/python3.6/multiprocessing/pool.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    597\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    598\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    601\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/arno/.pyenv/versions/3.6.0/lib/python3.6/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    549\u001b[0m             \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    550\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 551\u001b[0;31m                 \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    552\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    553\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/home/arno/.pyenv/versions/3.6.0/lib/python3.6/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    293\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m    \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    294\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 295\u001b[0;31m                 \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    296\u001b[0m                 \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    297\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import multiprocessing as mp\n",
    "pool = mp.Pool(8)\n",
    "zip(*pool.map(run_alpha, alphas))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#python glmnet - buggy?\n",
    "#if not os.path.exists(\"glmnet.cpu.txt\"):\n",
    "#    !cat results.glmnet.* | grep -v rmse | sed 's/,/ /g' | awk '{print $2, $3, $4}' > glmnet.cpu.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#ipums.R glmnet\n",
    "if not os.path.exists(\"glmnet.cpu.txt\"):\n",
    "    !cat results.glmnet.* | grep -v ^c | sed 's/,/ /g' | awk '{print $1, $2, $3}' > glmnet.cpu.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot solutions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#uncomment and refresh this cell for live updates\n",
    "#!grep validRMSE ~/pogs/examples/cpp/me0.6eb76ab.txt | awk '{print $6, $14, $20}' | sort -r -k3 > me0.6eb76ab.txt\n",
    "res = pd.read_csv(\"me0.6eb76ab.txt\", sep=\" \",header=None,names=['alpha','lambda','rmse'])\n",
    "best = res.ix[res['rmse']==np.min(res['rmse']),:]\n",
    "print(best)\n",
    "plt.scatter(np.log10(res['lambda']), res['alpha'], c=res['rmse'], cmap='jet', vmin=28500, vmax=42500)\n",
    "plt.colorbar()\n",
    "plt.annotate('o', xy=(np.log10(best['lambda']),best['alpha']), fontsize=50, horizontalalignment='center', verticalalignment='center')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "res = pd.read_csv(\"glmnet.cpu.txt\", sep=\" \",header=None,names=['alpha','lambda','rmse'])\n",
    "best = res.ix[res['rmse']==np.min(res['rmse']),:]\n",
    "print(best)\n",
    "plt.scatter(np.log10(res['lambda']), res['alpha'], c=res['rmse'], cmap='jet', vmin=28500, vmax=42500)\n",
    "plt.colorbar()\n",
    "plt.annotate('o', xy=(np.log10(best['lambda']),best['alpha']), fontsize=50, horizontalalignment='center', verticalalignment='center')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#glmnetPlot(fit, xvar = 'lambda', label = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### H2O"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "H2O session _sid_8d1f closed.\n"
     ]
    }
   ],
   "source": [
    "h2o.cluster().shutdown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking whether there is an H2O instance running at http://localhost:54321..... not found.\n",
      "Attempting to start a local H2O server...\n",
      "  Java Version: java version \"1.8.0_101\"; Java(TM) SE Runtime Environment (build 1.8.0_101-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.101-b13, mixed mode)\n",
      "  Starting server from /home/arno/.pyenv/versions/3.6.0/envs/h2oai/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar\n",
      "  Ice root: /tmp/tmpisjtm8fi\n",
      "  JVM stdout: /tmp/tmpisjtm8fi/h2o_arno_started_from_python.out\n",
      "  JVM stderr: /tmp/tmpisjtm8fi/h2o_arno_started_from_python.err\n",
      "  Server is running at http://127.0.0.1:54321\n",
      "Connecting to H2O server at http://127.0.0.1:54321... successful.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime:</td>\n",
       "<td>04 secs</td></tr>\n",
       "<tr><td>H2O cluster version:</td>\n",
       "<td>3.11.0.99999</td></tr>\n",
       "<tr><td>H2O cluster version age:</td>\n",
       "<td>1 hour and 5 minutes </td></tr>\n",
       "<tr><td>H2O cluster name:</td>\n",
       "<td>H2O_from_python_arno_zcvtp4</td></tr>\n",
       "<tr><td>H2O cluster total nodes:</td>\n",
       "<td>1</td></tr>\n",
       "<tr><td>H2O cluster free memory:</td>\n",
       "<td>26.67 Gb</td></tr>\n",
       "<tr><td>H2O cluster total cores:</td>\n",
       "<td>40</td></tr>\n",
       "<tr><td>H2O cluster allowed cores:</td>\n",
       "<td>40</td></tr>\n",
       "<tr><td>H2O cluster status:</td>\n",
       "<td>accepting new members, healthy</td></tr>\n",
       "<tr><td>H2O connection url:</td>\n",
       "<td>http://127.0.0.1:54321</td></tr>\n",
       "<tr><td>H2O connection proxy:</td>\n",
       "<td>None</td></tr>\n",
       "<tr><td>H2O internal security:</td>\n",
       "<td>False</td></tr>\n",
       "<tr><td>Python version:</td>\n",
       "<td>3.6.0 final</td></tr></table></div>"
      ],
      "text/plain": [
       "--------------------------  ------------------------------\n",
       "H2O cluster uptime:         04 secs\n",
       "H2O cluster version:        3.11.0.99999\n",
       "H2O cluster version age:    1 hour and 5 minutes\n",
       "H2O cluster name:           H2O_from_python_arno_zcvtp4\n",
       "H2O cluster total nodes:    1\n",
       "H2O cluster free memory:    26.67 Gb\n",
       "H2O cluster total cores:    40\n",
       "H2O cluster allowed cores:  40\n",
       "H2O cluster status:         accepting new members, healthy\n",
       "H2O connection url:         http://127.0.0.1:54321\n",
       "H2O connection proxy:\n",
       "H2O internal security:      False\n",
       "Python version:             3.6.0 final\n",
       "--------------------------  ------------------------------"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import h2o\n",
    "h2o.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
      "Time to parse with H2O: 12.206703662872314\n"
     ]
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "df_hex = h2o.import_file(\"ipums.csv\")\n",
    "t1 = time.time()\n",
    "print(\"Time to parse with H2O: %r\" % (t1-t0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "train_hex = df_hex[:H,:]\n",
    "valid_hex = df_hex[H:,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "alpha: \n",
      " 0.0alpha: alpha: alpha:  alpha:  0.42857142857142855 alpha: 0.57142857142857140.14285714285714285\n",
      "alpha:   \n",
      " 0.71428571428571430.2857142857142857alpha: 0.8571428571428571\n",
      " \n",
      "\n",
      "\n",
      "1.0\n",
      "glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |glm Model Build progress: |███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100%\n",
      "███rmse:  31180.05771259923\n",
      "███████████| 100%\n",
      "██████| 100%█\n",
      "█rmse:  31180.8918314204\n",
      "█rmse:  31107.897823484684\n",
      "████████| 100%\n",
      "████| 100%\n",
      "█████| 100%\n",
      "rmse:  31079.638497346365\n",
      "rmse:  31139.010876402226\n",
      "rmse:  31144.755781090662\n",
      "█████| 100%\n",
      "rmse:  31119.359360224007\n",
      "████████████████████████| 100%\n",
      "rmse:  31547.283581642805\n",
      "Time to train H2O: 320.9371554851532\n"
     ]
    }
   ],
   "source": [
    "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
    "\n",
    "def run_alpha(args):\n",
    "    cols, a, target, train_hex_id, valid_hex_id = args\n",
    "    print(\"alpha: \", a)\n",
    "    h2oglm = H2OGeneralizedLinearEstimator(nfolds=5,family=\"gaussian\", alpha=a, lambda_search=True)\n",
    "    h2oglm.train(x=cols, y=target, training_frame=h2o.get_frame(train_hex_id))\n",
    "    print(\"rmse: \", str(h2oglm.model_performance(h2o.get_frame(valid_hex_id)).rmse()))\n",
    "    \n",
    "train_hex.refresh()\n",
    "valid_hex.refresh()\n",
    "import multiprocessing as mp\n",
    "pool = mp.Pool(8)\n",
    "t0 = time.time()\n",
    "work = ((cols, a, target, train_hex.frame_id, valid_hex.frame_id) for a in alphas)\n",
    "pool.map(run_alpha, work)\n",
    "t1 = time.time()\n",
    "print(\"Time to train H2O: %r\" % (t1-t0))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
